/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.db; import java.io.*; import java.net.*; import java.util.*; import java.util.logging.*; import java.net.MalformedURLException; import javax.xml.parsers.*; import org.xml.sax.*; import org.xml.sax.helpers.*; import org.apache.xerces.util.XMLChar; import net.nutch.io.*; import net.nutch.net.*; import net.nutch.util.*; import net.nutch.pagedb.*; import net.nutch.linkdb.*; import net.nutch.util.NutchConf; /********************************************* * This class takes a flat file of URLs and adds * them as entries into a pagedb. Useful for * bootstrapping the system. * * @author Mike Cafarella * @author Doug Cutting *********************************************/ public class WebDBInjector { private static final String DMOZ_PAGENAME = "http://www.dmoz.org/"; private static final byte DEFAULT_INTERVAL = (byte)NutchConf.getInt("db.default.fetch.interval", 30); private static final float NEW_INJECTED_PAGE_SCORE = NutchConf.getFloat("db.score.injected", 2.0f); public static final Logger LOG = LogFormatter.getLogger("net.nutch.db.WebDBInjector"); /** * This filter fixes characters that might offend our parser. * This lets us be tolerant of errors that might appear in the input XML. */ private static class XMLCharFilter extends FilterReader { private boolean lastBad = false; public XMLCharFilter(Reader reader) { super(reader); } public int read() throws IOException { int c = in.read(); int value = c; if (c != -1 && !(XMLChar.isValid(c))) // fix invalid characters value = 'X'; else if (lastBad && c == '<') { // fix mis-matched brackets in.mark(1); if (in.read() != '/') value = 'X'; in.reset(); } lastBad = (c == 65533); return value; } public int read(char[] cbuf, int off, int len) throws IOException { int n = in.read(cbuf, off, len); if (n != -1) { for (int i = 0; i < n; i++) { char c = cbuf[off+i]; char value = c; if (!(XMLChar.isValid(c))) // fix invalid characters value = 'X'; else if (lastBad && c == '<') { // fix mis-matched brackets if (i != n-1 && cbuf[off+i+1] != '/') value = 'X'; } lastBad = (c == 65533); cbuf[off+i] = value; } } return n; } } /** * The RDFProcessor receives tag messages during a parse * of RDF XML data. We build whatever structures we need * from these messages. */ class RDFProcessor extends DefaultHandler { String curURL = null, curSection = null; boolean titlePending = false, descPending = false, insideAdultSection = false; StringBuffer title = new StringBuffer(), desc = new StringBuffer(); XMLReader reader; int subsetDenom; int hashSkew; boolean includeAdult, includeDmozDesc; MD5Hash srcDmozID; long srcDmozDomainID; Locator location; /** * Pass in an XMLReader, plus a flag as to whether we * should include adult material. */ public RDFProcessor(XMLReader reader, int subsetDenom, boolean includeAdult, boolean includeDmozDesc, int skew) throws IOException { this.reader = reader; this.subsetDenom = subsetDenom; this.includeAdult = includeAdult; this.includeDmozDesc = includeDmozDesc; // We create a Page entry for the "Dmoz" page, from // which all descriptive links originate. The name // of this page is always the same, stored in // DMOZ_PAGENAME. The MD5 is generated over the current // timestamp. Until this page is deleted, the descriptive // links will always be kept. // // If the DMOZ page is updated with new content, you // *could* update these links, if you really wanted to. // Just run inject again! This will replace the old // Dmoz Page, because we always keep the same name. // That obsolete Page will be deleted, and all its // outlinks (the descriptive ones) garbage-collected. // // Then we just proceed to add the new descriptive // links, with the brand-new page's src MD5. // this.srcDmozID = MD5Hash.digest(DMOZ_PAGENAME + "_" + nextFetch); Page dmozPage = new Page(DMOZ_PAGENAME, srcDmozID); dmozPage.setNextFetchTime(Long.MAX_VALUE); dbWriter.addPageIfNotPresent(dmozPage); this.srcDmozDomainID = MD5Hash.digest(new URL(DMOZ_PAGENAME).getHost()).halfDigest(); this.hashSkew = skew != 0 ? skew : new Random().nextInt(); } // // Interface ContentHandler // /** * Start of an XML elt */ public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { if ("Topic".equals(qName)) { curSection = atts.getValue("r:id"); } else if ("ExternalPage".equals(qName)) { // Porn filter if ((! includeAdult) && curSection.startsWith("Top/Adult")) { return; } // Subset denominator filter. // Only emit with a chance of 1/denominator. String url = atts.getValue("about"); int hashValue = MD5Hash.digest(url).hashCode(); hashValue = Math.abs(hashValue ^ hashSkew); if ((hashValue % subsetDenom) != 0) { return; } // We actually claim the URL! curURL = url; } else if (curURL != null && "d:Title".equals(qName)) { titlePending = true; } else if (curURL != null && "d:Description".equals(qName)) { descPending = true; } } /** * The contents of an XML elt */ public void characters(char ch[], int start, int length) { if (titlePending) { title.append(ch, start, length); } else if (descPending) { desc.append(ch, start, length); } } /** * Termination of XML elt */ public void endElement(String namespaceURI, String localName, String qName) throws SAXException { if (curURL != null) { if ("ExternalPage".equals(qName)) { // // Inc the number of pages, insert the page, and // possibly print status. // try { // First, manufacture the Page entry for the // given DMOZ listing. if (addPage(curURL)) { // Second, add a link from the DMOZ page TO the // just-added target Page. The anchor text should // be the merged Title and Desc that we get from // the DMOZ listing. For testing reasons, the // caller may choose to disallow this. if (includeDmozDesc) { String fullDesc = title + " " + desc; Link descLink = new Link(srcDmozID, srcDmozDomainID, curURL, fullDesc); dbWriter.addLink(descLink); } pages++; } } catch (MalformedURLException e) { LOG.fine("skipping " + curURL + ":" + e); } catch (IOException ie) { LOG.severe("problem adding url " + curURL + ": " + ie); } printStatusBar(2000, 50000); // // Clear out the link text. This is what // you would use for adding to the linkdb. // if (title.length() > 0) { title.delete(0, title.length()); } if (desc.length() > 0) { desc.delete(0, desc.length()); } // Null out the URL. curURL = null; } else if ("d:Title".equals(qName)) { titlePending = false; } else if ("d:Description".equals(qName)) { descPending = false; } } } /** * When parsing begins */ public void startDocument() { LOG.info("Begin parse"); } /** * When parsing ends */ public void endDocument() { LOG.info("Completed parse. Added " + pages + " pages."); } /** * From time to time the Parser will set the "current location" * by calling this function. It's useful for emitting locations * for error messages. */ public void setDocumentLocator(Locator locator) { location = locator; } // // Interface ErrorHandler // /** * Emit the exception message */ public void error(SAXParseException spe) { LOG.severe("Error: " + spe.toString() + ": " + spe.getMessage()); spe.printStackTrace(System.out); } /** * Emit the exception message, with line numbers */ public void fatalError(SAXParseException spe) { LOG.severe("Fatal error: " + spe.toString() + ": " + spe.getMessage()); LOG.severe("Last known line is " + location.getLineNumber() + ", column " + location.getColumnNumber()); spe.printStackTrace(System.out); } /** * Emit exception warning message */ public void warning(SAXParseException spe) { LOG.warning("Warning: " + spe.toString() + ": " + spe.getMessage()); spe.printStackTrace(System.out); } } private IWebDBWriter dbWriter; /** * WebDBInjector takes a reference to a WebDBWriter that it should add to. */ public WebDBInjector(IWebDBWriter dbWriter) { this.dbWriter = dbWriter; } /** * Close dbWriter and save changes */ public void close() throws IOException { dbWriter.close(); } /** * Utility to present small status bar */ public void printStatusBar(int small, int big){ if ((pages % small ) == 0) { System.out.print("."); } if ((pages % big ) == 0) { printStatus(); } } long startTime = System.currentTimeMillis(); long pages = 0; long nextFetch = System.currentTimeMillis(); /** * Utility to present performance stats */ public void printStatus(){ long elapsed = (System.currentTimeMillis() - this.startTime); if ( this.pages == 0) { } else { LOG.info("\t" + this.pages + "\t" + (int)((1000 * pages)/elapsed) + " pages/second\t" ); } } /** * Iterate through all the items in this flat text file and * add them to the db. */ public void injectURLFile(File urlList) throws IOException { nextFetch = urlList.lastModified(); BufferedReader reader = new BufferedReader(new FileReader(urlList)); try { String curStr = null; LOG.info("Starting URL processing"); while ((curStr = reader.readLine()) != null) { String url = curStr.trim(); if (addPage(url)) this.pages++; printStatusBar(2000,50000); } LOG.info("Added " + pages + " pages"); } catch (Exception e) { LOG.severe("error while injecting:" + e); } finally { reader.close(); } } /** * Iterate through all the items in this structured DMOZ file. * Add each URL to the web db. */ public void injectDmozFile(File dmozFile, int subsetDenom, boolean includeAdult, boolean includeDmozDesc, int skew) throws IOException, SAXException, ParserConfigurationException { nextFetch = dmozFile.lastModified(); SAXParserFactory parserFactory = SAXParserFactory.newInstance(); SAXParser parser = parserFactory.newSAXParser(); XMLReader reader = parser.getXMLReader(); // Create our own processor to receive SAX events RDFProcessor rp = new RDFProcessor(reader, subsetDenom, includeAdult, includeDmozDesc, skew); reader.setContentHandler(rp); reader.setErrorHandler(rp); LOG.info("skew = " + rp.hashSkew); // // Open filtered text stream. The UTF8Filter makes sure that // only appropriate XML-approved UTF8 characters are received. // Any non-conforming characters are silently skipped. // XMLCharFilter in = new XMLCharFilter(new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(dmozFile)), "UTF-8"))); try { InputSource is = new InputSource(in); reader.parse(is); } catch (Exception e) { LOG.severe(e.toString()); e.printStackTrace(System.out); System.exit(0); } finally { in.close(); } } private boolean addPage(String url) throws IOException { url = URLFilterFactory.getFilter().filter(url); if (url != null) { Page page = new Page(url, NEW_INJECTED_PAGE_SCORE, nextFetch); dbWriter.addPageIfNotPresent(page); return true; } return false; } /** * Command-line access. User may add URLs via a flat text file * or the structured DMOZ file. By default, we ignore Adult * material (as categorized by DMOZ). */ public static void main(String argv[]) throws Exception { if (argv.length < 3) { System.out.println("Usage: WebDBInjector <db_dir> (-urlfile <url_file> | -dmozfile <dmoz_file>) [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-noDmozDesc]"); return; } // // Parse the command line, figure out what kind of // URL file we need to load // int subsetDenom = 1; int skew = 0; String dbDir = null, command = null, loadfile = null; boolean includeAdult = false, includeDmozDesc = true; for (int i = 0; i < argv.length; i++) { if ("-urlfile".equals(argv[i]) || "-dmozfile".equals(argv[i])) { command = argv[i]; loadfile = argv[i+1]; i++; } else if ("-includeAdultMaterial".equals(argv[i])) { includeAdult = true; } else if ("-noDmozDesc".equals(argv[i])) { includeDmozDesc = false; } else if ("-subset".equals(argv[i])) { subsetDenom = Integer.parseInt(argv[i+1]); i++; } else if ("-skew".equals(argv[i])) { skew = Integer.parseInt(argv[i+1]); i++; } else { dbDir = argv[i]; } } // // Create the webdbWriter, the injector, and then inject the // right kind of URL file. // IWebDBWriter writer = new WebDBWriter(new File(dbDir)); WebDBInjector injector = new WebDBInjector(writer); if ("-urlfile".equals(command)) { injector.injectURLFile(new File(loadfile)); } else if ("-dmozfile".equals(command)) { injector.injectDmozFile(new File(loadfile), subsetDenom, includeAdult, includeDmozDesc, skew); } else { System.out.println("No command indicated."); return; } injector.close(); } }